{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TRAINING UNWEIGHTED MULTICLASS BERT CLASSIFIER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialise relevant packages\n",
    "\n",
    "# Basics\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle\n",
    "\n",
    "# Preprocessing\n",
    "import torch\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Modelling\n",
    "from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments\n",
    "\n",
    "# Evaluation\n",
    "from sklearn.metrics import classification_report, f1_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load datasets\n",
    "training_data = pd.read_pickle('./Data/Clean Training Data/training_data_multiclass.pkl')\n",
    "\n",
    "df_raw = {}\n",
    "\n",
    "# write to dict\n",
    "for dataset in training_data:\n",
    "    df_raw[dataset] = training_data[dataset].copy() #.sample(n=10000, random_state=123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "davidson2017\n",
      "label\n",
      "0     1430\n",
      "1    19190\n",
      "2     4163\n",
      "Name: text, dtype: int64\n",
      "\n",
      "founta2018\n",
      "label\n",
      "0     4965\n",
      "1    27150\n",
      "2    53851\n",
      "3    14030\n",
      "Name: text, dtype: int64\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# for multiclass: convert string label names into integer IDs\n",
    "\n",
    "df_raw['davidson2017'].label.replace({\"hateful\": 0, \"offensive\": 1, \"neither\": 2}, inplace = True)\n",
    "df_raw['founta2018'].label.replace({\"hateful\": 0, \"abusive\": 1, \"normal\": 2, \"spam\": 3}, inplace = True)\n",
    "\n",
    "for dataset in df_raw:\n",
    "    print(dataset)\n",
    "    print(df_raw[dataset].groupby('label').text.count())\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split each dataset into training and validation set\n",
    "df_train, df_valtest, df_val, df_test = {}, {}, {}, {}\n",
    "\n",
    "for dataset in df_raw:\n",
    "    df_train[dataset], df_valtest[dataset] = train_test_split(df_raw[dataset], test_size=0.2, stratify=df_raw[dataset].label, random_state=123)\n",
    "    df_val[dataset], df_test[dataset] = train_test_split(df_valtest[dataset], test_size=0.5, stratify=df_valtest[dataset].label, random_state=123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split up text and label columns in dataframes into series for each dataset\n",
    "train_texts, val_texts, test_texts, train_labels, val_labels, test_labels = {}, {}, {}, {}, {}, {}\n",
    "\n",
    "for dataset in df_raw:\n",
    "    train_texts[dataset] = df_train[dataset].text.astype(\"string\").tolist()\n",
    "    val_texts[dataset] = df_val[dataset].text.astype(\"string\").tolist()\n",
    "    test_texts[dataset] = df_test[dataset].text.astype(\"string\").tolist()\n",
    "    \n",
    "    train_labels[dataset] = df_train[dataset].label.tolist()\n",
    "    val_labels[dataset] = df_val[dataset].label.tolist()\n",
    "    test_labels[dataset] = df_test[dataset].label.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenize Texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import tokenizer\n",
    "tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')\n",
    "\n",
    "# add special tokens for URLs, emojis and mentions (--> see pre-processing)\n",
    "special_tokens_dict = {'additional_special_tokens': ['[USER]','[EMOJI]','[URL]']}\n",
    "num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize text series for each dataset\n",
    "train_encodings, val_encodings, test_encodings = {}, {}, {}\n",
    "\n",
    "for dataset in df_raw:\n",
    "    train_encodings[dataset] = tokenizer(train_texts[dataset], truncation=True, padding=True)\n",
    "    val_encodings[dataset] = tokenizer(val_texts[dataset], truncation=True, padding=True)\n",
    "    test_encodings[dataset] = tokenizer(test_texts[dataset], truncation=True, padding=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create PyTorch Datasets "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "class HateDataset(torch.utils.data.Dataset):\n",
    "    def __init__(self, encodings, labels):\n",
    "        self.encodings = encodings\n",
    "        self.labels = labels\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
    "        item['labels'] = torch.tensor(self.labels[idx])\n",
    "        return item\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.labels)\n",
    "\n",
    "train_dataset, val_dataset, test_dataset = {}, {}, {}\n",
    "    \n",
    "for dataset in df_raw:\n",
    "    train_dataset[dataset] = HateDataset(train_encodings[dataset], train_labels[dataset])\n",
    "    val_dataset[dataset] = HateDataset(val_encodings[dataset], val_labels[dataset])\n",
    "    test_dataset[dataset] = HateDataset(test_encodings[dataset], test_labels[dataset])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train Unweighted Multiclass Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "Tesla K80\n",
      "4 GPUs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "device(type='cuda')"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check CUDA availability\n",
    "print(torch.cuda.is_available())\n",
    "print(torch.cuda.get_device_name())\n",
    "print(torch.cuda.device_count(), 'GPUs')\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define training arguments, matching weighted binary model (for which we did hyperparameter tuning)\n",
    "training_args = {}\n",
    "\n",
    "for dataset in df_raw:\n",
    "    training_args[dataset] = TrainingArguments(\n",
    "        save_steps = 2500,\n",
    "        output_dir='./Models/BERT_{}_multiclass/Checkpoints'.format(dataset), # output directory\n",
    "        num_train_epochs=3,              # total number of training epochs\n",
    "        per_device_train_batch_size=16,  # batch size per device during training\n",
    "        per_device_eval_batch_size=64,   # batch size for evaluation\n",
    "        evaluation_strategy = 'epoch',\n",
    "        warmup_steps=500,                # number of warmup steps for learning rate scheduler\n",
    "        weight_decay=0.01,               # strength of weight decay\n",
    "        learning_rate = 5e-5,\n",
    "        seed = 123\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define explicit model initialisation. Different functions for each dataset to have correct number of labels (could be more elegant)\n",
    "def model_init_D17(dataset):\n",
    "    model = BertForSequenceClassification.from_pretrained(\"bert-base-uncased\", num_labels=3)\n",
    "    # resize to match tokenizer length with special tokens added above\n",
    "    model.resize_token_embeddings(len(tokenizer))\n",
    "    return model\n",
    "\n",
    "def model_init_F18(dataset):\n",
    "    model = BertForSequenceClassification.from_pretrained(\"bert-base-uncased\", num_labels=4)\n",
    "    # resize to match tokenizer length with special tokens added above\n",
    "    model.resize_token_embeddings(len(tokenizer))\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "# Instantiate trainer objects for each dataset\n",
    "trainer = {}\n",
    "\n",
    "for dataset in df_raw:\n",
    "    if dataset == 'davidson2017':\n",
    "        trainer[dataset] = Trainer(\n",
    "            args=training_args[dataset],                  \n",
    "            train_dataset=train_dataset[dataset],         \n",
    "            eval_dataset=val_dataset[dataset],            \n",
    "            model_init = model_init_D17\n",
    "        )\n",
    "    if dataset == 'founta2018':\n",
    "        trainer[dataset] = Trainer(\n",
    "            args=training_args[dataset],                  \n",
    "            train_dataset=train_dataset[dataset],         \n",
    "            eval_dataset=val_dataset[dataset],            \n",
    "            model_init = model_init_F18\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train models for each dataset\n",
    "for dataset in trainer:\n",
    "    print('Training multiclass {} BERT model'.format(dataset))\n",
    "    trainer[dataset].train()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save Model and Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dataset in trainer:    \n",
    "    trainer[dataset].save_model('./Models/BERT_{}_multiclass/Final'.format(dataset))\n",
    "    tokenizer.save_pretrained('./Models/BERT_{}_multiclass/Final'.format(dataset))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reload Models\n",
    "So that models can be evaluated on test set even after kernel resets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load fine-tuned models\n",
    "models = {}\n",
    "\n",
    "for dataset in ['davidson2017','founta2018']:\n",
    "        models[dataset] = BertForSequenceClassification.from_pretrained(\"./Models/BERT_{}_multiclass/Final\".format(dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate trainer objects for each model (already fine-tuned so no longer necessary to specify training and eval data)\n",
    "# output directory is redundant because there is no further training but needs to be specified anyway\n",
    "\n",
    "trainer = {}\n",
    "\n",
    "for model in models:\n",
    "    trainer[model] = Trainer(\n",
    "        model=models[model],         \n",
    "        args=TrainingArguments(\n",
    "            output_dir='./Models/BERT_{}_multiclass/Test'.format(model),\n",
    "            per_device_eval_batch_size = 64)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate Models on Test Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate each model on its corresponding test set\n",
    "\n",
    "results = {}\n",
    "\n",
    "for dataset in trainer:\n",
    "    print('Evaluating multiclass {} BERT model on test data'.format(dataset))\n",
    "    results[dataset] = trainer[dataset].predict(test_dataset[dataset])\n",
    "    for metric in results[dataset].metrics:\n",
    "        print(metric, results[dataset].metrics['{}'.format(metric)])\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DAVIDSON2017\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.49      0.37      0.42       143\n",
      "           1       0.93      0.96      0.95      1919\n",
      "           2       0.91      0.87      0.89       417\n",
      "\n",
      "    accuracy                           0.91      2479\n",
      "   macro avg       0.78      0.73      0.75      2479\n",
      "weighted avg       0.90      0.91      0.91      2479\n",
      "\n",
      "\n",
      "FOUNTA2018\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.53      0.40      0.45       497\n",
      "           1       0.85      0.93      0.89      2715\n",
      "           2       0.86      0.86      0.86      5385\n",
      "           3       0.62      0.56      0.59      1403\n",
      "\n",
      "    accuracy                           0.82     10000\n",
      "   macro avg       0.72      0.69      0.70     10000\n",
      "weighted avg       0.81      0.82      0.81     10000\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# write predictions to series\n",
    "pred_labels={}\n",
    "\n",
    "for dataset in trainer:\n",
    "\n",
    "    preds=[]\n",
    "    \n",
    "    for row in results[dataset][0]:\n",
    "        preds.append(int(np.argmax(row)))\n",
    "    \n",
    "    pred_labels[dataset] = pd.Series(preds)\n",
    "\n",
    "# print classification reports for each model\n",
    "\n",
    "for dataset in trainer:\n",
    "        print(dataset.upper(), 'multiclass')\n",
    "        print(classification_report(test_labels[dataset],pred_labels[dataset]))\n",
    "        print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DAVIDSON2017\n",
      "micro F1 score: 91.09%\n",
      "macro F1 score: 75.25%\n",
      "weighted F1 score: 90.70%\n",
      "\n",
      "FOUNTA2018\n",
      "micro F1 score: 81.67%\n",
      "macro F1 score: 69.92%\n",
      "weighted F1 score: 81.21%\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# f1 scores\n",
    "for dataset in trainer:\n",
    "        print(dataset.upper())\n",
    "        for average in ['micro', 'macro', 'weighted']:\n",
    "            print('{} F1 score: {:.2%}'.format(average, f1_score(test_labels[dataset],pred_labels[dataset], average=average)))\n",
    "        print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DAVIDSON2017\n",
      "1    1971\n",
      "2     399\n",
      "0     109\n",
      "dtype: int64\n",
      "\n",
      "FOUNTA2018\n",
      "2    5369\n",
      "1    2977\n",
      "3    1280\n",
      "0     374\n",
      "dtype: int64\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for dataset in trainer:\n",
    "        print(dataset.upper())\n",
    "        print(pred_labels[dataset].value_counts())\n",
    "        print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
